!! Copyright (C) 2009,2010,2011,2012  Marco Restelli
!!
!! This file is part of:
!!   FEMilaro -- Finite Element Method toolkit
!!
!! FEMilaro is free software; you can redistribute it and/or modify it
!! under the terms of the GNU General Public License as published by
!! the Free Software Foundation; either version 3 of the License, or
!! (at your option) any later version.
!!
!! FEMilaro is distributed in the hope that it will be useful, but
!! WITHOUT ANY WARRANTY; without even the implied warranty of
!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
!! General Public License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with FEMilaro; If not, see <http://www.gnu.org/licenses/>.
!!
!! author: Marco Restelli                   <marco.restelli@gmail.com>

!>\brief
!! GMRES solver for linear systems.
!!
!! \n
!!
!! This is an implementation of the GMRES solver for linear systems
!! with the modified Gram-Schmidt orthogonalization. Implementation
!! as in <a
!! href="http://www.netlib.org/templates/templates.pdf"><em>Templates
!! for the Solution of Linear Systems: Building Blocks for Iterative
!! Methods</em></a>.
!!
!! We now give a detailed description of the GMRES algorithm. As in
!! any Krylov method, given the initial guess \f$x^{(0)}\f$, we look
!! for a solution \f$x^{(k)}\in W_k\f$, with
!! \f{equation}{
!!  W_k = \left\{ v = x^{(0)}+y, \quad y\in K_k(A;r^{(0)}) \right\}
!! \f}
!! and
!! \f{equation}{
!!  K_k(A;r^{(0)}) = \left\{ \underbrace{
!!  r^{(0)},\,Ar^{(0)},\ldots,A^{k-1}r^{(0)} }_{k\,\,{\rm elements}}
!!  \right\}.
!! \f}
!! Denoting by \f$V_k\f$ an orthonormal basis of \f$K_k\f$, it follows
!! that
!! \f{equation}{
!!   x^{(k)} = x^{(0)} + V_kz^{(k)},
!! \f}
!! where \f$z^{(k)}\f$ are the \f$k\f$ coefficients of the linear
!! combination yielding \f$x^{(k)}\f$. The corresponding residual is
!! \f{equation}{
!!   r^{(k)} = r^{(0)} - AV_kz^{(k)},
!! \f}
!! so that \f$r^{(k)}\f$ belongs to an affine \f$k\f$-dimensional
!! subspace of the \f$(k+1)\f$-dimensional space
!! \f$K_{k+1}(A;r^{(0)})\f$, and the exact solution can be reached
!! when such a subspace contains the origin. In the GMRES method,
!! \f$z^{(k)}\f$ is obtained solving in the least square sense the
!! overdetermined system
!! \f{equation}{
!!   r^{(k)} = r^{(0)} - AV_kz^{(k)} = 0,
!! \f}
!! which amounts to minimizing \f$\|r^{(k)}\|\f$. Such a least square
!! problem (in principle) leads to the normal equation
!! \f{equation}{
!!   V_k^TA^TAV_kz^{(k)} = V_k^TA^Tr^{(0)}.
!! \f}
!! In practice, however, once the orthogonal basis \f$V_k\f$ is
!! available, it is more convenient to use a carefully adapted version
!! of the QR factorization to factorize the rectangular matrix
!! \f$AV_k\f$. The algorithm is thus composed of the
!! following steps:
!! <ol>
!!  <li> compute the orthogonal basis \f$V_k\f$
!!  <li> compute the QR factorization of \f$AV_k\f$
!!  <li> estimate the residual and if it is satisfactory compute
!!  \f$x^{(k)}\f$ and exit.
!! </ol>
!! An important aspect of the method is that the above steps can be
!! performed incremetally with respect to the dimension \f$k\f$ of the
!! Krylov space.
!!
!! In the following, we detail the resulting algorithm.
!!
!! \section arnoldi Arnoldi algorithm to compute the orthogonal basis
!!
!! The Arnoldi algorithm essentially is the Gram-Schmidt procedure
!! (either in its plain or modified form) applied to the basis
!! \f$\left\{A^jr^{(0)}\right\}\f$ of the Krylov space
!! \f$K(A;r^{(0)})\f$.
!!
!! To build an orthonormal basis \f$\left\{v_j\right\}\f$ from a
!! generic basis \f$\left\{w_j\right\}\f$, in the Gram-Schmidt
!! algorithm we compute \f$v_j\f$ by
!! \f{equation}{
!!   \tilde{v}_j = w_j - \sum_{i=1}^{j-1} (w_j\cdot v_i)v_i, \qquad
!!   v_j = \tilde{v}_j/\|\tilde{v}_j\|.
!! \f}
!! In the modified version of the algorithm, we use, for
!! \f$i=1,\ldots,j-1\f$,
!! \f{equation}{
!!   \tilde{v}^{(1)}_j = w_j, \qquad
!!   \tilde{v}^{(i+1)}_j = \tilde{v}^{(i)}_j - (\tilde{v}^{(i)}_j\cdot
!!   v_i)v_i, \qquad
!!   v_j = \tilde{v}^{(j)}_j/\|\tilde{v}^{(j)}_j\|.
!! \f}
!! In fact, the sole difference is the computation of the scalar
!! products \f$(w_j\cdot v_i)\f$ (plain version) and
!! \f$(\tilde{v}^{(i)}_j\cdot v_i)\f$ (modified version). Notice
!! that:
!! <ol>
!!  <li> the two algorithms yield the <em>same</em> basis
!!  \f$\left\{v_j\right\}\f$
!!  <li> we have \f$ w_j\cdot v_i = \tilde{v}^{(p)}_j\cdot v_i, \f$
!!  for any \f$ 1\leq p\leq i\f$
!!  <li> the normalization factors are the same: \f$ \|\tilde{v}_j\| =
!!  \|\tilde{v}^{(j)}_j\|\f$.
!! </ol>
!!
!! In the case of the Krylov space, this read as follows:
!! <ul>
!!  <li> set \f$v_1=r^{(0)}/\|r^{(0)}\|\f$
!!  <li> for \f$j\geq2\f$, compute the following basis vectors by
!!  <ul>
!!   <li> setting \f$w_{j+1}=Av_j\f$
!!   <li> use the Gram-Schmidt algorithm to compute \f$v_{j+1}\f$ from
!!   \f$w_{j+1}\f$.
!!  </ul>
!! </ul>
!! The coefficients generated in each step of the orthonormalization
!! are collected in the (square) Hessenberg matrix
!! \f{equation}{
!!   H_k = \left[
!!    \begin{array}{ccccc}
!!      h_{11} & h_{12} & h_{13} & \ldots & h_{1k} \\[0.125mm]
!!      h_{21} & h_{22} & h_{23} & \ldots & h_{2k} \\[0.125mm]
!!        0    & h_{32} & h_{33} & \ldots & h_{3k} \\[0.125mm]
!!      \vdots & \ddots & \ddots & \ddots & \vdots \\[0.125mm]
!!        0    & \ldots & 0 & h_{kk-1} & h_{kk}
!!    \end{array}
!!   \right].
!! \f}
!! where
!! \f{equation}{
!!  h_{ij} = w_{j+1}\cdot v_i = \tilde{v}_{j+1}^{(i)}\cdot v_i =
!!  Av_j\cdot v_i, \qquad i=1,\ldots,j
!! \f}
!! and, for \f$j<k\f$,
!! \f{equation}{
!!  h_{j+1j} = \|\tilde{v}_{j+1}\| = \|\tilde{v}_{j+1}^{(j+1)}\|.
!! \f}
!! A second (rectangular) matrix is obtained adding
!! \f{equation}{
!!  h_{k+1k} = \|\tilde{v}_{k+1}\| = \|\tilde{v}_{k+1}^{(k+1)}\|,
!! \f}
!! so that
!! \f{equation}{
!!  \hat{H}_k = \left[
!!   \begin{array}{c}
!!     H_k \\
!!     \hline 
!!     \begin{array}{cccc}
!!       0 & \ldots & 0 & h_{k+1k}
!!     \end{array}
!!   \end{array}
!!  \right].
!! \f}
!! Two important relations are
!! \f{equation}{
!!  V_k^TAV_k = H_k, \qquad AV_k = V_{k+1}\hat{H}_k.
!! \f}
!! \note The way the coefficients \f$h_{ij}\f$ are collected in the
!! matrix \f$H\f$ is slightly different from what is done in the QR
!! factorization. This will be discussed in the next section.
!!
!! \section QR QR factorization
!!
!! Using the fact that \f$AV_k= V_{k+1}\hat{H}_k\f$, we can rewrite the
!! least square problem as
!! \f{equation}{
!!  V_{k+1}\hat{H}_k z^{(k)} = r^{(0)}
!! \f}
!! wich is almost a (reduced) QR factorization except for the first
!! subdiagonal of \f$\hat{H}_k\f$. Thus we can find \f$k\f$
!! rotation matrices (Givens matrices) \f$F_i\f$ such that
!! \f{equation}{
!!  \hat{H}_k = Q R
!! \f}
!! where \f$Q\f$ is \f$k+1\times k+1\f$ and \f$R\f$ is \f$k+1\times
!! k\f$ and
!! \f{equation}{
!!   Q = \prod_{i=k}^1 F^T_i.
!! \f}
!! Notice that the definition of the \f$F_i\f$ is such that \f$R =
!! F_k\ldots F_2F_1\hat{H}_k\f$.
!! The solutin of the least square problem is then
!! \f{equation}{
!!   \tilde{R}z^{(k)} = \tilde{Q}^T V_{k+1}^Tr^{(0)} = \tilde{Q}^T
!!   \|r^{(0)}\|e_{1,k+1},
!! \f}
!! where \f$e_{1,k+1}\f$ is the \f$(k+1)\f$-dimensional vector
!! \f$[1,0,\ldots,0]^T\f$, and the corresponding residual is
!! \f$\|r^{(0)}\|Q(:,k+1)^Te_{1,k+1}\f$.
!!
!! \section pseudocode Summary of the algorithm
!!
!! The complete GMRES algorithm can be written as follows (we use here
!! the modified Gram-Schmidt method).
!! <ul>
!!  <li> compute the first basis vector \f$v_1=
!!  \frac{r^{(0)}}{\|r^{(0)}\|}\f$ and store it as \f$V(:,1)\f$,
!!  <li> for \f$j=1,\ldots,k\f$
!!  <ul>
!!   <li> set \f$w_{j+1} = Av_j\f$
!!   <li> set \f$\tilde{v}_{j+1}^{(1)}= w_{j+1}\f$
!!   <li> for \f$i=1,j\f$
!!   <ul>
!!    <li> compute \f$\hat{H}(i,j) = \tilde{v}_{j+1}^{(i)} \cdot v_i \f$
!!    <li> compute \f$\tilde{v}_{j+1}^{(i+1)} = \tilde{v}_{j+1}^{(i)}
!!     -\hat{H}(i,j)v_i \f$
!!   </ul>
!!   <li> set \f$\hat{H}(j+1,j)=\|\tilde{v}_{j+1}^{(j+1)}\|\f$
!!   <li> compute the new basis vector \f$ v_{j+1} = \frac{
!!   \tilde{v}_{j+1}^{(j+1)}}{\hat{H}(j+1,j)} \f$
!!   <li> store the new basis vector \f$ v_{j+1}\f$ as \f$V(:,j+1)\f$
!!   <li> apply the previous rotations to the new column of
!!   \f$\hat{H}\f$ (in fact, \f$\hat{H}\f$ stores, in this
!!   implementation, the upper trapezioidal matrix \f$R\f$): for
!!   \f$i=1,j-1\f$
!!   <ul>
!!    <li> set \f$\hat{H}(i:i+1,j) = F_i \hat{H}(i:i+1,j)\f$
!!   </ul>
!!   <li> define the new rotation \f$F_j\f$
!!   <li> apply the new rotation \f$F_j\f$ to \f$\hat{H}(j:j+1,j)\f$,
!!   notice that this yields \f$\hat{H}(j+1,j)=0\f$
!!   <li> update the right hand side and evaluate the residual
!!  </ul>
!!  <li> solve the upper triangular system
!!  <li> assemble the solution.
!! </ul>
!!
!! \section parallel_execution Parallel execution
!! 
!! As discussed in \c c_stv, there are two kinds of distributed data:
!! those for which shared data are assumed to be copies of the same
!! value and those for which shared data are supposed to be added
!! together. For this implementation, we follow the general
!! specifications of \c mod_iterativesolvers_base, so that all the
!! variables \c x, \c r, \c w and \c v are of the first kind.
!<----------------------------------------------------------------------
module mod_gmres

!-----------------------------------------------------------------------

 use mod_messages, only: &
   mod_messages_initialized, &
   error,   &
   warning, &
   info

 use mod_kinds, only: &
   mod_kinds_initialized, &
   wp

 use mod_mpi_utils, only: &
   mod_mpi_utils_initialized, &
   mpi_logical, mpi_integer, wp_mpi, &
   mpi_bcast

 use mod_state_vars, only: &
   mod_state_vars_initialized, &
   c_stv

 use mod_iterativesolvers_base, only: &
   mod_iterativesolvers_base_initialized, &
   c_itpb

!-----------------------------------------------------------------------
 
 implicit none

!-----------------------------------------------------------------------

! Module interface

 public :: &
   mod_gmres_constructor, &
   mod_gmres_destructor,  &
   mod_gmres_initialized, &
   gmres

 private

!-----------------------------------------------------------------------

 ! Module types and parameters

! Module variables

 ! public members
 logical, protected ::               &
   mod_gmres_initialized = .false.

 character(len=*), parameter :: &
   this_mod_name = 'mod_gmres'

!-----------------------------------------------------------------------

contains

!-----------------------------------------------------------------------

 subroutine mod_gmres_constructor()
  character(len=*), parameter :: &
    this_sub_name = 'constructor'

   !Consistency checks ---------------------------
   if( (mod_messages_initialized.eqv..false.) .or. &
          (mod_kinds_initialized.eqv..false.) .or. &
      (mod_mpi_utils_initialized.eqv..false.) .or. &
     (mod_state_vars_initialized.eqv..false.) .or. &
(mod_iterativesolvers_base_initialized.eqv..false.) ) then
     call error(this_sub_name,this_mod_name, &
                'Not all the required modules are initialized.')
   endif
   if(mod_gmres_initialized.eqv..true.) then
     call warning(this_sub_name,this_mod_name, &
                  'Module is already initialized.')
   endif
   !----------------------------------------------

   mod_gmres_initialized = .true.
 end subroutine mod_gmres_constructor

!-----------------------------------------------------------------------
 
 subroutine mod_gmres_destructor()
  character(len=*), parameter :: &
    this_sub_name = 'destructor'
   
   !Consistency checks ---------------------------
   if(mod_gmres_initialized.eqv..false.) then
     call error(this_sub_name,this_mod_name, &
                'This module is not initialized.')
   endif
   !----------------------------------------------

   mod_gmres_initialized = .false.
 end subroutine mod_gmres_destructor

!-----------------------------------------------------------------------
 
 !> The GMRES solver
 subroutine gmres(x,sys , maxiterex,niter,res)
  !> linear system
  class(c_itpb), intent(in) :: sys
  !> solution (must contain the initial guess)
  class(c_stv), intent(inout) :: x
  !> true if reached m iterations
  logical, intent(out) :: maxiterex
  !> number of iterations (defined as the number of evaluation of
  !! the lhs which, due to the initial evaluation of the residual,
  !! is equal to the number of Arnoldi iterations +1)
  integer, intent(out) :: niter
  !> residual norms
  real(wp), allocatable, intent(out) :: res(:)
 
  logical :: &
    convergence, &
    breakdown,   &
    lbuf(2),     & ! logical mpi buffer
    master
  integer :: &
    m,           & ! maximum number of iterations
    ir,          & ! restart index
    niter_r,     & ! iterations of the current restart
    i,           & ! index for the Arnoldi loop
    k,           & ! index for the Gram Schmidt orthogonalization
    nkry,        & ! dimension of the Krylov space
    ierr           ! error variable
  real(wp) :: &
    tol,         & ! effective tolerance (absolute o relative)
    tol2,        &
    hki, hk1i,   &
    den
  real(wp), allocatable :: &
    rn2(:), & ! right hand side / two-norm of the residual
    h(:,:), & ! Hessemberg matrix
    c(:), s(:), & ! rotation matrices
    wres(:,:) ! working array to store the residuals
  class(c_stv), allocatable :: &
    r,   & ! residual
    w,   & ! new Krylov vector
    v(:)   ! Krylov basis
  logical, parameter :: debug = .true.
  character(len=*), parameter :: &
    this_sub_name = 'gmres'

   !--------------------------------------------------------------------
   ! 0) Initialization
   master = sys%mpi_id.eq.0

   m = sys%nmax
   allocate( r    , source=x )
   allocate( w    , source=x )
   allocate( v(m) , source=x )
   if(master) then
     allocate( rn2(m) , h(m,m-1) , c(m) , s(m) , wres(m,sys%rmax) )
   else
     allocate( rn2(1) , h(m,m-1) , c(m) , s(0) , wres(0,   0    ) )
   endif
   !--------------------------------------------------------------------

   if(master) niter = 0
   restart_do: do ir=1,sys%rmax

     !------------------------------------------------------------------
     ! 1) compute the preconditioned residual
     call sys%pres( r , x )
     rn2(1) = sqrt( r%scal( r ) ) ! used on all procs.

     if(master) then
       wres(1,ir) = rn2(1)
       convergence = rn2(1).eq.0.0_wp ! already done
     endif
     call mpi_bcast(convergence,1,mpi_logical,0,sys%mpi_comm,ierr)
     if(convergence) then
       if(master) then
         maxiterex = .false.
         allocate(res(0))
       endif
       return
     endif
     !------------------------------------------------------------------

     !------------------------------------------------------------------
     ! 2) compute the first vector of the Krylov space
     call v(1)%mlt( 1.0_wp/rn2(1) , r )
     !------------------------------------------------------------------

     !------------------------------------------------------------------
     ! 3) define the tolerance: can be absolute or relative
     if(master) then
       if(ir.eq.1) then
         if(sys%abstol) then
           tol = sys%tolerance
         else
           tol = sys%tolerance*rn2(1)
         endif
         tol2 = tol**2
       endif
     endif
     !------------------------------------------------------------------

     ! 4) Arnoldi loop
     arnoldi: do i=1,m-1

       !----------------------------------------------------------------
       ! 4.1) compute the next (i.e. i+1) Krylov vector
       call sys%pkry( w , v(i) )

       ! 4.2) Gram-Schmidt orthogonalization
       gs_orth: do k=1,i
         hki = w%scal( v(k) )
         call w%inlt( -hki , v(k) )
         if(master) h(k,i) = hki
       enddo gs_orth

       ! 4.3) new element for h (this is needed on all procs.)
       h(i+1,i) = sqrt( w%scal( w ) )
       !----------------------------------------------------------------

       if(master) then

         !--------------------------------------------------------------
         ! 4.4) check for breakdown
         breakdown = h(i+1,i).lt.tol2
         ! If there is no breakdown we can define v(i+1) = w/h(i+1,i).
         ! This will be done after the master process block, since it
         ! must be executed by all the processes.
         !--------------------------------------------------------------

         !--------------------------------------------------------------
         ! 4.5) apply the rotation matrices
         rotation: do k=1,i-1
           hki  = h(k  ,i)
           hk1i = h(k+1,i)
           h(k  ,i) =  c(k)*hki + s(k)*hk1i
           h(k+1,i) = -s(k)*hki + c(k)*hk1i
         enddo rotation

         ! 4.6) compute the new (i.e. i) rotation
         den = sqrt(h(i,i)**2 + h(i+1,i)**2)
         c(i) = h(i  ,i) / den
         s(i) = h(i+1,i) / den

         ! 4.7) complete applying the rotation matrices
         h(i,i) =  c(i)*h(i,i) + s(i)*h(i+1,i)
         ! we should also apply the rotation to h(i+1,i), which would be
         ! set to 0.0, however this value will never be used and we can
         ! skip setting it. In fact, h(i+1,i) retains its value, which
         ! will be used later to normalize w.

         ! 4.8) compute new residual norm
         rn2(i+1) = -s(i)*rn2(i)
         rn2(i  ) =  c(i)*rn2(i)

         wres(i+1,ir) = abs(rn2(i+1))
         !--------------------------------------------------------------

         !--------------------------------------------------------------
         ! 4.9) check whether we are done
         convergence = wres(i+1,ir).lt.tol

         lbuf(1) = convergence
         lbuf(2) = breakdown
       endif
       call mpi_bcast(lbuf,2,mpi_logical,0,sys%mpi_comm,ierr)
       if(.not.master) then
         convergence = lbuf(1)
         breakdown   = lbuf(2)
       endif
       !----------------------------------------------------------------

       !----------------------------------------------------------------
       ! 4.10) compute the new Krylov vector
       if(.not.breakdown) call v(i+1)%mlt( 1.0_wp/h(i+1,i) , w )
       !----------------------------------------------------------------

       !----------------------------------------------------------------
       ! 4.11) This step is used only for debug, because there is no
       ! need to compute the residual vector to know its module
       if(debug) call step_debug()
       !----------------------------------------------------------------

       if( breakdown .or. convergence ) exit arnoldi
     enddo arnoldi

     !------------------------------------------------------------------
     ! 5) update the counters
     if(master) then
       if((.not.breakdown).and.(.not.convergence)) then
         ! We are here because we have exceeded m. Notice that if the
         ! arnoldi loop has been terminated with the exit instruction,
         ! we have (at most) i=m-1, while if the loop has been
         ! terminated by the loop counter we have i=m (i is
         ! incremented after finishing the do loop). This is a problem
         ! because only the first m-1 columns h(i,i) have been set in
         ! the arnoldi loop, but we have i=m. We thus decrement i.
         i = i-1 
       endif
       nkry = i
       niter_r = nkry+1
       niter = niter + niter_r
     endif
     call mpi_bcast(nkry,1,mpi_integer,0,sys%mpi_comm,ierr)
     !------------------------------------------------------------------

     !------------------------------------------------------------------
     ! Steps 7) and 8): evaluation of the Krylov expansion
     call final_solve(nkry,h,c,rn2,master)
     !------------------------------------------------------------------

     !------------------------------------------------------------------
     ! 8) we have to decide whether to exit the restart loop
     if(breakdown.or.convergence) exit restart_do
   enddo restart_do

   !--------------------------------------------------------------------
   ! 9) check for convergence and diagnostics
   if(master) then
     if((.not.breakdown).and.(.not.convergence)) then
       ! The residual is still larger than the prescribed tolerance
       maxiterex = .true.
       ir = ir-1
     else
       maxiterex = .false.
     endif
     allocate( res(m*(ir-1)+niter_r) )
     do i=1,ir-1
       res( (i-1)*m+1 : i*m ) = wres(:,i)
     enddo
     res( (ir-1)*m+1 : (ir-1)*m+niter_r ) = wres(1:niter_r,ir)
   endif
   !--------------------------------------------------------------------

   !--------------------------------------------------------------------
   ! 10) clean-up
   deallocate( r , rn2 , v , w , h , c , s , wres )
   !--------------------------------------------------------------------

 contains

  subroutine final_solve(nk,h,c,rn2,master)
   logical, intent(in) :: master
   integer, intent(in) :: nk ! dimension of the Krylov space
   real(wp), intent(in) :: h(:,:), rn2(:)
   real(wp), intent(inout) :: c(:)

   integer :: i, k
   real(wp) :: ci
   ! This work array is introduced as a workarounf of
   ! http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56691
   ! so that the inlv function can be called with a full array.
   class(c_stv), allocatable :: v_workaround(:)

     !------------------------------------------------------------------
     ! 6) compute the coefficient of the Krylov expansion (back sub.)
     if(master) then
       krylov: do i=nk,1,-1
         ! coefficients are stored in c for convenience
         ci = rn2(i)
         do k=nk,i+1,-1
           ci = ci - h(i,k)*c(k)
         enddo
         c(i) = ci/h(i,i)
       enddo krylov
     endif
     call mpi_bcast(c,nk,wp_mpi,0,sys%mpi_comm,ierr)
     !------------------------------------------------------------------

     !------------------------------------------------------------------
     ! 7) evaluate the Krylov expansion
     ! The nex line is the correct code
     !
     !  call x%inlv( c(1:nk) , v(1:nk) )
     !
     ! while this is the workaround to handle
     ! http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56691
     allocate( v_workaround(nk) , source=x )
     do k=1,nk
       call v_workaround(k)%copy(v(k))
     enddo
     call x%inlv( c(1:nk) , v_workaround )
     deallocate( v_workaround )
     !------------------------------------------------------------------

  end subroutine final_solve

  subroutine step_debug()

   real(wp) :: res_norm
   real(wp), allocatable :: cloc(:)
   class(c_stv), allocatable :: xloc

    !------------------------------------------------------------------
    ! Evaluation of the Krylov expansion
    if(master) nkry = i
    call mpi_bcast(nkry,1,mpi_integer,0,sys%mpi_comm,ierr)

    allocate(cloc(m)); cloc = c
    ! x is modified in final_solve: we make a local copy xloc to
    ! restore the correct value after executing step_debug
    allocate( xloc , source=x )
    call final_solve(nkry,h,c,rn2,master)
    c = cloc; deallocate(cloc) ! restore the original values
    !-------------------------------------------------------------------

    !-------------------------------------------------------------------
    ! Compute the residual
    call sys%pres( r , x )
    res_norm = sqrt( r%scal( r ) )
    call x%copy(xloc); deallocate( xloc )
    !-------------------------------------------------------------------

    !-------------------------------------------------------------------
    ! Compare the results
    if(master) then
      write(*,*) 'Debug information for the GMRES solver:'
      write(*,*) '  restart        ', ir
      write(*,*) '  iteration      ', i
      write(*,*) '  residual norm  ', res_norm
      write(*,*) '  res. norm (est)', wres(i+1,ir)
    endif
    !-------------------------------------------------------------------

  end subroutine step_debug

 end subroutine gmres
 
!-----------------------------------------------------------------------

end module mod_gmres

